eds_222_final_project_polished

Libraries

#loading the necessary libraries
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   0.3.5
✔ tibble  3.1.8     ✔ stringr 1.5.0
✔ tidyr   1.2.1     ✔ forcats 0.5.1
✔ readr   2.1.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(here)
here() starts at /Users/colleenmccamy/Documents/MEDS/EDS_222_Stats/final_project/tou-energy-analysis
library(readr)
library(gt)
library(tufte)
library(feasts)
Loading required package: fabletools
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(lubridate)

Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(broom)
library(tsibble)

Attaching package: 'tsibble'

The following object is masked from 'package:lubridate':

    interval

The following objects are masked from 'package:base':

    intersect, setdiff, union
library(ggpubr)
library(ggiraph)
library(ggiraphExtra)
library(sjPlot)
Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(ggcorrplot)
library(car)
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:purrr':

    some

The following object is masked from 'package:dplyr':

    recode
library(modelr)

Attaching package: 'modelr'

The following object is masked from 'package:broom':

    bootstrap

Loading in the Data & Data Wrangling

# setting my root directory
rootdir <- ("/Users/colleenmccamy/Documents/MEDS/EDS_222_Stats/final_project")

# reading in the data
eia_data_raw <- read_csv(paste0(rootdir, "/data/eia_data.csv"))
Warning: One or more parsing issues, see `problems()` for details
Rows: 35436 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): region, region_name, system_operator, stystem_operator_name
dbl  (1): hourly_energy_mwh
dttm (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# cleaning the data to be the two variables of interest
eia_df <- eia_data_raw |> 
  select(date, hourly_energy_mwh) |> 
  na.omit()
  
# creating a time series dataframe
eia_ts <- eia_df |> 
  as_tsibble()
Using `date` as index variable.

Adding Temperature Data

# loading in the temperature data
temp_data <- read_csv(paste0(rootdir, "/data/sd_temp_data.csv"))
Rows: 1522 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Date, temp_max, temp_avg, temp_dept
dbl (1): temp_min

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# wrangling the data
temp_data <- temp_data |> 
  mutate(temp_max = as.numeric(temp_max)) |> 
  mutate(temp_min = as.numeric(temp_min)) |> 
  mutate(temp_avg = as.numeric(temp_avg)) |> 
  mutate(temp_dept = as.numeric(temp_dept)) |> 
  mutate(date = lubridate::mdy(Date)) |> 
  select(!Date)
Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

Merging the Data

# restructuring the eia data to merge the dataset with the temperature data by date
eia_data <- eia_df |> 
  mutate(time = (date)) |> 
  mutate(date = as.Date(date))
eia_data$time <- format(eia_data$time, format = "%H:%M:%S")

# merging the data into one dataframe
energy_temp_df <- left_join(x = eia_data,
                            y = temp_data,
                            by = "date")

Grouping Data by Day

# creating dataframe for tou peak horus
tou_peak_hours_df <- energy_temp_df |> 
  filter(time >= 16 & time <= 21)

# grouping it for daily peak hours to plot with daily maximum temperature
daily_peak_hrs_df <- tou_peak_hours_df |> 
   group_by(date) |> 
   summarize(daily_energy_mwh = sum(hourly_energy_mwh))

Basic Visualization

# exploring the data by plotting energy demand throughout time
energy_demand_plot <- ggplot(data = eia_df,
       aes(x = date, 
           y = hourly_energy_mwh)) +
  geom_line(col = "#b52b8c") +
  labs(title = "Hourly Energy Demand (MWh)",
       x = "Date",
       y = "MWh") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# plotting daily peak energy demand with daily max temperatures
peak_demand_plot <- ggplot(data = daily_peak_hrs_df,
       aes(x = date, 
           y = daily_energy_mwh)) +
  geom_line(col = "#b52b8c") +
  labs(title = "Hourly Energy Demand (MWh)",
       x = "Date",
       y = "MWh") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# exploring the data by plotting maximum temperature throughout time
max_temp_plot <- ggplot(temp_data, aes(x = date, y = temp_max)) + 
  geom_line(col = "#52796f") +
  labs(title = "Maximum Temperature per day (°F)",
       x = "Date",
       y = "Max Temperature (°F)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# plotting along with daily temperature
ggarrange(peak_demand_plot, max_temp_plot,
                    ncol = 2, nrow = 1)

Visualizing Daily Demand for Peak Hours

# peak_demand_plot <- ggplot(data = daily_peak_hrs_df,
#        aes(x = date, 
#            y = daily_energy_mwh)) +
#   geom_line(col = "#b52b8c") +
#   labs(title = "Hourly Energy Demand (MWh)",
#        x = "Date",
#        y = "MWh") +
#   theme_minimal() +
#   theme(plot.title = element_text(hjust = 0.5))
# peak_demand_plot
# 
# View(test)
# # plotting along with daily temperature
# ggarrange(peak_demand_plot, max_temp_plot,
#                     ncol = 2, nrow = 1)

Determining a “Hot Day”

### ---- Determining a "Hot Day" ---- 

# determining the mean and standard deviation for the time period of interest
mean_max_temp <- mean(energy_temp_df$temp_max, na.rm = TRUE)
sd_max_temp <- sd(energy_temp_df$temp_max, na.rm = TRUE)

print(mean_max_temp)
[1] 72.02147
print(sd_max_temp)
[1] 6.934419
# preparing the data to plot
box_data <- as_tibble(energy_temp_df$temp_max)

# plotting the mean and standard deviation
temp_box <- ggplot(box_data) +
  geom_boxplot(aes(x = value)) +
  labs(x = "Maximum Daily Temperature (°F)") +
  theme_minimal()

temp_box
Warning: Removed 48 rows containing non-finite values (`stat_boxplot()`).

Adding a ‘Hot Day’ Indicator in the Dataframe

temp_demand_daily <- energy_temp_df |> 
  group_by(date) |> 
  summarize(daily_energy_mwh = sum(hourly_energy_mwh)) |> 
  left_join(temp_data, by = "date") |> 
  mutate(hot_day = case_when(
    (temp_max >= 80) ~ 1,
    (temp_max <= 79) ~ 0))

Determining Temperature’s Effect

# 
# # running a simple linear regression
# model_hot_demand <- lm(formula = daily_energy_mwh ~ hot_day, 
#    data = energy_temp_df)
# summary(model_hot_demand)
# 
# # conducting a t-test
# ttest_temp_demand <- t.test(daily_energy_mwh ~ hot_day, 
#                             data = temp_demand_daily)
# ttest_temp_demand

Adding TOU Policy and Peak Hours to Dataframe

# adding a year separate year column in the dataframe
energy_temp_df <- energy_temp_df |> 
  mutate(year = date)

energy_temp_df$year <- format(energy_temp_df$year, format = "%Y") 

# adding binary variables
energy_temp_df <- energy_temp_df |> 
  mutate(tou_policy = case_when(
    (year > 2020) ~ 1,
    (year <= 2020) ~ 0)) |> 
  mutate(time = as_datetime(time, format = "%H:%M:%S")) |> 
  mutate(time = lubridate::hour(time)) |> 
  mutate(tou_policy = case_when(
    (year > 2020) ~ 1,
    (year <= 2020) ~ 0)) |> 
  mutate(peak_hours = case_when(
    (time < 16) ~ 0,
    (time >= 16 & time <= 21 ) ~ 1,
    (time > 21) ~0)) |> 
  mutate(hot_day = case_when(
    (temp_max >= 80) ~ 1,
    (temp_max <= 79) ~ 0))

Linear Model

#### ----- Linear Regression on Hourly Energy Demand ---- ###
model_tou_peak_demand <- lm(formula = hourly_energy_mwh ~ 
                              tou_policy + 
                              peak_hours +
                              hot_day, 
                            data = energy_temp_df)

summary(model_tou_peak_demand)

Call:
lm(formula = hourly_energy_mwh ~ tou_policy + peak_hours + hot_day, 
    data = energy_temp_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-1472.31  -291.28   -49.13   278.97  2018.97 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 2164.025      3.313  653.16   <2e-16 ***
tou_policy  -107.822      4.628  -23.30   <2e-16 ***
peak_hours  -360.175      5.176  -69.59   <2e-16 ***
hot_day      409.103      6.445   63.47   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 425.1 on 36002 degrees of freedom
  (48 observations deleted due to missingness)
Multiple R-squared:  0.2128,    Adjusted R-squared:  0.2127 
F-statistic:  3244 on 3 and 36002 DF,  p-value: < 2.2e-16
# results
tab_model(model_tou_peak_demand,
          pred.labels = c("Intercept", 
                          "TOU Policy In Effect", 
                          "During Peak Hours", 
                          "Max. Temp above 80 (°F)"),
          dv.labels = c("Hourly Electricity Demand (MWh)"),
          string.ci = "Conf. Int (95%)",
          string.p = "P-value",
          title = "Table 1. Linear Model Results for Predictors on Hourly Electricity Demand",
          digits = 0)
Table 1. Linear Model Results for Predictors on Hourly Electricity Demand
  Hourly Electricity Demand (MWh)
Predictors Estimates Conf. Int (95%) P-value
Intercept 2164 2158 – 2171 <0.001
TOU Policy In Effect -108 -117 – -99 <0.001
During Peak Hours -360 -370 – -350 <0.001
Max. Temp above 80 (°F) 409 396 – 422 <0.001
Observations 36006
R2 / R2 adjusted 0.213 / 0.213

Conducting a QQ Plot of residual for energy demand

# creating residuals from the model
aug <- energy_temp_df |>  
  add_predictions(model_tou_peak_demand) |> 
  mutate(residuals_energy = hourly_energy_mwh - pred)

#plotting the residuals
qqPlot(aug$residuals_energy) 

[1] 16604 16605

Plotting the Linear Regression

ggPredict(model_tou_peak_demand, 
          jitter = TRUE, 
          interactive = TRUE)

Linear Regression with Interation Added

# conducting the model
model_int_tou_peak_demand <- lm(formula = hourly_energy_mwh ~ 
                              tou_policy + 
                              peak_hours +
                              hot_day + 
                              peak_hours * tou_policy,
                            data = energy_temp_df)
# getting the output of the model
summary(model_int_tou_peak_demand)

Call:
lm(formula = hourly_energy_mwh ~ tou_policy + peak_hours + hot_day + 
    peak_hours * tou_policy, data = energy_temp_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-1487.56  -292.30   -51.44   279.56  2028.45 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)           2154.551      3.462 622.296   <2e-16 ***
tou_policy             -83.108      5.331 -15.590   <2e-16 ***
peak_hours            -322.250      6.583 -48.955   <2e-16 ***
hot_day                409.116      6.438  63.552   <2e-16 ***
tou_policy:peak_hours  -98.959     10.633  -9.307   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 424.6 on 36001 degrees of freedom
  (48 observations deleted due to missingness)
Multiple R-squared:  0.2147,    Adjusted R-squared:  0.2146 
F-statistic:  2461 on 4 and 36001 DF,  p-value: < 2.2e-16
# adding output to a table
tab_model(model_int_tou_peak_demand,
          pred.labels = c("Intercept", 
                          "TOU Policy In Effect", 
                          "During Peak Hours", 
                          "Max. Temp above 80 (°F)",
                          "TOU Policy & Peak Hours"),
          dv.labels = c("Hourly Electricity Demand (MWh)"),
          string.ci = "Conf. Int (95%)",
          string.p = "P-value",
          title = "Table 2. Linear Model Results for Predictors on Hourly Electricity Demand with an Interaction Addition",
          digits = 2)
Table 2. Linear Model Results for Predictors on Hourly Electricity Demand with an Interaction Addition
  Hourly Electricity Demand (MWh)
Predictors Estimates Conf. Int (95%) P-value
Intercept 2154.55 2147.77 – 2161.34 <0.001
TOU Policy In Effect -83.11 -93.56 – -72.66 <0.001
During Peak Hours -322.25 -335.15 – -309.35 <0.001
Max. Temp above 80 (°F) 409.12 396.50 – 421.73 <0.001
TOU Policy & Peak Hours -98.96 -119.80 – -78.12 <0.001
Observations 36006
R2 / R2 adjusted 0.215 / 0.215

Classical Decomposition

x = seq(from = ymd('2018-07-1'), 
        length.out = 1481,
        by='day')

# preparing the dataframe for the time series
decom_df <- energy_temp_df |>
  group_by(date) |>
  summarize(daily_energy_mwh = sum(hourly_energy_mwh)) |> 
  mutate(index = x)

decom_ts <- as_tsibble(decom_df, index = index)

# conducting the classical decomposition and plotting it - season is 365 days
decom_plot_annual <- model(decom_ts, 
                    classical_decomposition(daily_energy_mwh ~ 
                                              season(365), 
                                            type = "additive")) |> 
  components() |> 
  autoplot(col = "#3d405b") +
  theme_minimal() +
  labs(title = "Classical Decomposition Model",
       subtitle = "Seasonality defined as 365 days",
       x = "Date",
       caption = "Figure 4")

# conducting the classical decomposition and plotting it - season is 30 days 
decom_plot_monthly <- model(decom_ts, 
                    classical_decomposition(daily_energy_mwh ~ 
                                              season(30), 
                                            type = "additive")) |> 
  components() |> 
  autoplot(col = "#3d405b") +
  theme_minimal() +
  labs(title = "Classical Decomposition Model",
       subtitle = "Seasonality defined as 30 days",
       x = "Date",
       caption = "Figure 3")

# stacking the two plots
ggarrange(decom_plot_monthly, decom_plot_annual,
                    ncol = 1, nrow = 2)